Conversation
| def __init__(self, hf_dir: str): | ||
| index_file = os.path.join(hf_dir, "model.safetensors.index.json") | ||
| config = AutoConfig.from_pretrained(hf_dir) | ||
| config = AutoConfig.from_pretrained(hf_dir, trust_remote_code=True) |
There was a problem hiding this comment.
we can't set trust_remote_code by default
| hf_model_path, trust_remote_code=trust_remote_code | ||
| ) | ||
|
|
||
| if hasattr(config, "num_nextn_predict_layers"): |
There was a problem hiding this comment.
why do we set this — is this debug code?
There was a problem hiding this comment.
@ArronHZG This should not be needed once the patches here are applied #62 (comment)
| @@ -27,9 +27,17 @@ def init_distributed(): | |||
|
|
|||
| def load_model(hf_model_path, trust_remote_code=False): | |||
| """Load model""" | |||
| bridge = AutoBridge.from_pretrained( | |||
|
|
|||
| # use AutoConfig to change hf config | |||
| config = AutoConfig.from_pretrained( | |||
| hf_model_path, trust_remote_code=trust_remote_code | |||
| ) | |||
|
|
|||
| if hasattr(config, "num_nextn_predict_layers"): | |||
| config.num_nextn_predict_layers = 0 | |||
|
|
|||
| bridge = AutoBridge.from_config(config) | |||
|
|
|||
There was a problem hiding this comment.
These changes should not be needed
There was a problem hiding this comment.
Should MTP be added to the training? My understanding is that it should still be left to the user to decide.
There was a problem hiding this comment.
Should MTP be added to the training? My understanding is that it should still be left to the user to decide.
Ah, if it's for this purpose, I think it's better to remove this line instead, as this flag directly controls whether MTP is enabled on the Megatron side:
mbridge/mbridge/models/mimo.py
Line 29 in ed7d432
| # Handle transformer components within MTP | ||
| # Check if this is a transformer_layer component | ||
| if "transformer_layer" in name: | ||
| # Create a proxy name to use with parent class methods | ||
| # Convert mtp.layers.{idx}.transformer_layer.* to decoder.layers.{idx}.* | ||
| proxy_name = name.replace( | ||
| f"mtp.layers.{mtp_layer_idx}.transformer_layer", | ||
| f"decoder.layers.{mtp_layer_idx}", | ||
| ) |
There was a problem hiding this comment.
These changes are needed (replace lines 84–92 with this code) so that we don't need to disable num_nextn_predict_layers when loading from HF weights (so that MTP weights can be loaded correctly)
| # Handle transformer components within MTP. MCore may expose these under | |
| # either "...transformer_layer.*" or "...mtp_model_layer.*". | |
| layer_prefixes = ("transformer_layer", "mtp_model_layer") | |
| proxy_name = None | |
| for layer_prefix in layer_prefixes: | |
| mcore_prefix = f"mtp.layers.{mtp_layer_idx}.{layer_prefix}" | |
| if mcore_prefix in name: | |
| proxy_name = name.replace( | |
| mcore_prefix, | |
| f"decoder.layers.{mtp_layer_idx}", | |
| ) | |
| break | |
| if proxy_name is not None: |
There was a problem hiding this comment.
Should MTP be added to the training? My understanding is that it should still be left to the user to decide.
This is used to pre-configure the relevant MTP settings based on the model configuration.